'''
In this script, the target query is first converted to its embedding using a BERT-based SentenceTransformer model, and then the cosine similarity between this embedding and every embedding in the dataset is computed.
The sentences whose similarity is greater than the threshold are selected and printed.
'''


import numpy as np
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Path to the pickled DataFrame; the script reads its "embedding",
# "document_id" and "sentence" columns.
directory = '<Path of data.pkl file>'
# Encode on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Only rows whose cosine similarity to the query is strictly greater than
# this value are reported.
similarity_threshold = 0.8

# NOTE(security): read_pickle can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
dataset = pd.read_pickle(directory)
# Stack the per-row embeddings into a single array, one row per dataset row.
# Assumes every stored embedding is a 1-D vector of the same length -- TODO confirm.
all_data = np.stack(dataset["embedding"].values)

target_sentence = "sample sentence."

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
# The query is passed as a one-element list so the result is a 2-D array,
# which is the input shape cosine_similarity expects.
embeddings = model.encode([target_sentence], convert_to_numpy=True, device=device)

# One similarity score per dataset row, flattened from (n_rows, 1) to (n_rows,).
cosine_similarities = cosine_similarity(all_data, embeddings).ravel()
# np.where returns *positions* into all_data, not DataFrame index labels.
indices = np.where(cosine_similarities > similarity_threshold)[0]

print(f"Searching for similar query as \n{target_sentence}\n\n")
print("Results:")
for idx in indices:
    print('_'*30)
    # Positional lookup (.iloc): idx is a row position, so label-based .loc
    # would fail or return the wrong row when the DataFrame index is not
    # the default 0..n-1 range.
    print(dataset['document_id'].iloc[idx])
    print(dataset['sentence'].iloc[idx])
    